Required setup
We first need to prepare our environment with the necessary packages and set a global theme for publishable plots in ggplot()
# Sets the repository to download packages from
options(repos = list(CRAN = "http://cran.rstudio.com/"))
# Sets the number of significant figures to two - e.g., 0.01
options(digits = 2)
# Required package for quick package downloading and loading
install.packages("pacman")
##
## The downloaded binary packages are in
## /var/folders/c0/w15tbmpn36d7jj_tpjm_sz9c0000gn/T//Rtmpzwjzmn/downloaded_packages
# Downloads and load required packages
pacman::p_load(dlookr,
formattable,
ggdist,
ggpubr,
ggridges,
kableExtra,
knitr,
nycflights13,
papeR,
RColorBrewer,
Stat2Data,
tidyverse)
# This is basically saying: set the base theme for my ggplots() to theme_pubr()
# ggpubr is a package that helps create publishable ggplots() and theme_pubr() cuts a lot of steps in customizing a ggplot()
theme_set(theme_pubr())
1.0 Load and examine a data set
- Load data and view
- Examine columns and data types
- Define box plots
- Describe meta data
# Let's load a data set from the flights data set
data("flights")
# What does the data look like?
formattable(head(flights))
|
year
|
month
|
day
|
dep_time
|
sched_dep_time
|
dep_delay
|
arr_time
|
sched_arr_time
|
arr_delay
|
carrier
|
flight
|
tailnum
|
origin
|
dest
|
air_time
|
distance
|
hour
|
minute
|
time_hour
|
|
2013
|
1
|
1
|
517
|
515
|
2
|
830
|
819
|
11
|
UA
|
1545
|
N14228
|
EWR
|
IAH
|
227
|
1400
|
5
|
15
|
2013-01-01 05:00:00
|
|
2013
|
1
|
1
|
533
|
529
|
4
|
850
|
830
|
20
|
UA
|
1714
|
N24211
|
LGA
|
IAH
|
227
|
1416
|
5
|
29
|
2013-01-01 05:00:00
|
|
2013
|
1
|
1
|
542
|
540
|
2
|
923
|
850
|
33
|
AA
|
1141
|
N619AA
|
JFK
|
MIA
|
160
|
1089
|
5
|
40
|
2013-01-01 05:00:00
|
|
2013
|
1
|
1
|
544
|
545
|
-1
|
1004
|
1022
|
-18
|
B6
|
725
|
N804JB
|
JFK
|
BQN
|
183
|
1576
|
5
|
45
|
2013-01-01 05:00:00
|
|
2013
|
1
|
1
|
554
|
600
|
-6
|
812
|
837
|
-25
|
DL
|
461
|
N668DN
|
LGA
|
ATL
|
116
|
762
|
6
|
0
|
2013-01-01 06:00:00
|
|
2013
|
1
|
1
|
554
|
558
|
-4
|
740
|
728
|
12
|
UA
|
1696
|
N39463
|
EWR
|
ORD
|
150
|
719
|
5
|
58
|
2013-01-01 05:00:00
|
1.1 Diagnose your data
# What are the properties of the data
formattable(diagnose(flights))
|
variables
|
types
|
missing_count
|
missing_percent
|
unique_count
|
unique_rate
|
|
year
|
integer
|
0
|
0.00
|
1
|
3.0e-06
|
|
month
|
integer
|
0
|
0.00
|
12
|
3.6e-05
|
|
day
|
integer
|
0
|
0.00
|
31
|
9.2e-05
|
|
dep_time
|
integer
|
8255
|
2.45
|
1319
|
3.9e-03
|
|
sched_dep_time
|
integer
|
0
|
0.00
|
1021
|
3.0e-03
|
|
dep_delay
|
numeric
|
8255
|
2.45
|
528
|
1.6e-03
|
|
arr_time
|
integer
|
8713
|
2.59
|
1412
|
4.2e-03
|
|
sched_arr_time
|
integer
|
0
|
0.00
|
1163
|
3.5e-03
|
|
arr_delay
|
numeric
|
9430
|
2.80
|
578
|
1.7e-03
|
|
carrier
|
character
|
0
|
0.00
|
16
|
4.8e-05
|
|
flight
|
integer
|
0
|
0.00
|
3844
|
1.1e-02
|
|
tailnum
|
character
|
2512
|
0.75
|
4044
|
1.2e-02
|
|
origin
|
character
|
0
|
0.00
|
3
|
8.9e-06
|
|
dest
|
character
|
0
|
0.00
|
105
|
3.1e-04
|
|
air_time
|
numeric
|
9430
|
2.80
|
510
|
1.5e-03
|
|
distance
|
numeric
|
0
|
0.00
|
214
|
6.4e-04
|
|
hour
|
numeric
|
0
|
0.00
|
20
|
5.9e-05
|
|
minute
|
numeric
|
0
|
0.00
|
60
|
1.8e-04
|
|
time_hour
|
POSIXct
|
0
|
0.00
|
6936
|
2.1e-02
|
variables: name of each variable
types: data type of each variable
missing_count: number of missing values
missing_percent: percentage of missing values
unique_count: number of unique values
unique_rate: rate of unique value - unique_count / number of observations
Credit to Cédric Scherer
1.2 Summary statistics of your data
Numerical variables
formattable(diagnose_numeric(flights))
|
variables
|
min
|
Q1
|
mean
|
median
|
Q3
|
max
|
zero
|
minus
|
outlier
|
|
year
|
2013
|
2013
|
2013.0
|
2013
|
2013
|
2013
|
0
|
0
|
0
|
|
month
|
1
|
4
|
6.5
|
7
|
10
|
12
|
0
|
0
|
0
|
|
day
|
1
|
8
|
15.7
|
16
|
23
|
31
|
0
|
0
|
0
|
|
dep_time
|
1
|
907
|
1349.1
|
1401
|
1744
|
2400
|
0
|
0
|
0
|
|
sched_dep_time
|
106
|
906
|
1344.3
|
1359
|
1729
|
2359
|
0
|
0
|
0
|
|
dep_delay
|
-43
|
-5
|
12.6
|
-2
|
11
|
1301
|
16514
|
183575
|
43216
|
|
arr_time
|
1
|
1104
|
1502.1
|
1535
|
1940
|
2400
|
0
|
0
|
0
|
|
sched_arr_time
|
1
|
1124
|
1536.4
|
1556
|
1945
|
2359
|
0
|
0
|
0
|
|
arr_delay
|
-86
|
-17
|
6.9
|
-5
|
14
|
1272
|
5409
|
188933
|
27880
|
|
flight
|
1
|
553
|
1971.9
|
1496
|
3465
|
8500
|
0
|
0
|
1
|
|
air_time
|
20
|
82
|
150.7
|
129
|
192
|
695
|
0
|
0
|
5448
|
|
distance
|
17
|
502
|
1039.9
|
872
|
1389
|
4983
|
0
|
0
|
715
|
|
hour
|
1
|
9
|
13.2
|
13
|
17
|
23
|
0
|
0
|
0
|
|
minute
|
0
|
8
|
26.2
|
29
|
44
|
59
|
60696
|
0
|
0
|
min: minimum value
Q1: 1/4 quartile, 25th percentile
mean: arithmetic mean (average value)
median: median, 50th percentile
Q3: 3/4 quartile, 75th percentile
max: maximum value
zero: number of observations with the value 0
minus: number of observations with negative numbers
outlier: number of outliers
Diagnose outliers and missing values
Categorical variables
formattable(diagnose_category(flights))
|
variables
|
levels
|
N
|
freq
|
ratio
|
rank
|
|
carrier
|
UA
|
336776
|
58665
|
17.420
|
1
|
|
carrier
|
B6
|
336776
|
54635
|
16.223
|
2
|
|
carrier
|
EV
|
336776
|
54173
|
16.086
|
3
|
|
carrier
|
DL
|
336776
|
48110
|
14.285
|
4
|
|
carrier
|
AA
|
336776
|
32729
|
9.718
|
5
|
|
carrier
|
MQ
|
336776
|
26397
|
7.838
|
6
|
|
carrier
|
US
|
336776
|
20536
|
6.098
|
7
|
|
carrier
|
9E
|
336776
|
18460
|
5.481
|
8
|
|
carrier
|
WN
|
336776
|
12275
|
3.645
|
9
|
|
carrier
|
VX
|
336776
|
5162
|
1.533
|
10
|
|
tailnum
|
NA
|
336776
|
2512
|
0.746
|
1
|
|
tailnum
|
N725MQ
|
336776
|
575
|
0.171
|
2
|
|
tailnum
|
N722MQ
|
336776
|
513
|
0.152
|
3
|
|
tailnum
|
N723MQ
|
336776
|
507
|
0.151
|
4
|
|
tailnum
|
N711MQ
|
336776
|
486
|
0.144
|
5
|
|
tailnum
|
N713MQ
|
336776
|
483
|
0.143
|
6
|
|
tailnum
|
N258JB
|
336776
|
427
|
0.127
|
7
|
|
tailnum
|
N298JB
|
336776
|
407
|
0.121
|
8
|
|
tailnum
|
N353JB
|
336776
|
404
|
0.120
|
9
|
|
tailnum
|
N351JB
|
336776
|
402
|
0.119
|
10
|
|
origin
|
EWR
|
336776
|
120835
|
35.880
|
1
|
|
origin
|
JFK
|
336776
|
111279
|
33.042
|
2
|
|
origin
|
LGA
|
336776
|
104662
|
31.078
|
3
|
|
dest
|
ORD
|
336776
|
17283
|
5.132
|
1
|
|
dest
|
ATL
|
336776
|
17215
|
5.112
|
2
|
|
dest
|
LAX
|
336776
|
16174
|
4.803
|
3
|
|
dest
|
BOS
|
336776
|
15508
|
4.605
|
4
|
|
dest
|
MCO
|
336776
|
14082
|
4.181
|
5
|
|
dest
|
CLT
|
336776
|
14064
|
4.176
|
6
|
|
dest
|
SFO
|
336776
|
13331
|
3.958
|
7
|
|
dest
|
FLL
|
336776
|
12055
|
3.580
|
8
|
|
dest
|
MIA
|
336776
|
11728
|
3.482
|
9
|
|
dest
|
DCA
|
336776
|
9705
|
2.882
|
10
|
|
time_hour
|
2013-09-13 08:00:00
|
336776
|
94
|
0.028
|
1
|
|
time_hour
|
2013-09-20 08:00:00
|
336776
|
94
|
0.028
|
1
|
|
time_hour
|
2013-09-09 08:00:00
|
336776
|
93
|
0.028
|
3
|
|
time_hour
|
2013-09-16 08:00:00
|
336776
|
93
|
0.028
|
3
|
|
time_hour
|
2013-09-23 08:00:00
|
336776
|
93
|
0.028
|
3
|
|
time_hour
|
2013-09-19 08:00:00
|
336776
|
92
|
0.027
|
6
|
|
time_hour
|
2013-10-11 08:00:00
|
336776
|
92
|
0.027
|
6
|
|
time_hour
|
2013-09-10 08:00:00
|
336776
|
91
|
0.027
|
8
|
|
time_hour
|
2013-09-12 08:00:00
|
336776
|
91
|
0.027
|
8
|
|
time_hour
|
2013-09-17 08:00:00
|
336776
|
91
|
0.027
|
8
|
variables: category names
levels: group names within categories
N: number of observation
freq: number of observation at group level / number of observation at category level
ratio: percentage of observation at group level / number of observation at category level
rank: rank of the occupancy ratio of levels (order in which the groups are in the category)
Missing values (NAs)
- Table showing the extent of NAs in columns containing them
- Plot showing the frequency of missing values
# Create the NA table
NA.Table <- plot_na_pareto(flights, only_na = TRUE, plot = FALSE)
# Publishable table
formattable(NA.Table)
|
variable
|
frequencies
|
ratio
|
grade
|
cumulative
|
|
air_time
|
9430
|
0.0280
|
Good
|
20
|
|
arr_delay
|
9430
|
0.0280
|
Good
|
40
|
|
arr_time
|
8713
|
0.0259
|
Good
|
59
|
|
dep_delay
|
8255
|
0.0245
|
Good
|
77
|
|
dep_time
|
8255
|
0.0245
|
Good
|
95
|
|
tailnum
|
2512
|
0.0075
|
Good
|
100
|
# Plot the intersect of the columns with the most missing values
# This means that some combinations of columns have missing values in the same row
plot_na_intersect(flights, only_na = TRUE)

Produce an HTML summary of a data set
#diagnose_web_report(flights)